import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
### gensim, LDA model, stopwords
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS as swords
import pyLDAvis.gensim
#nltk Lemmatize and stemmer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.porter import *
np.random.seed(2018)
#nltk.download('wordnet')
#nltk.download('stopwords')
from nltk.corpus import stopwords as st
from nltk.corpus import wordnet
#nltk.download('averaged_perceptron_tagger')
### LDA visualization
import pyLDAvis
import pyLDAvis.gensim
%matplotlib inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors
from sklearn.manifold import TSNE
import plotly as py
import plotly.graph_objects as go
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import random as sparse_random
from sklearn.random_projection import sparse_random_matrix
from sklearn.decomposition import PCA
import seaborn as sns
from pyclustertend import hopkins
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import completeness_score
from sklearn.metrics import homogeneity_score
Text data has been a recent challenge for data scientists to research and analyze, given its unstructured format and non-numeric features. It is quite different from the typical numeric or categorical features we encounter more often in data analysis tasks. The dataset we picked is merged from three data sources, because we want to ensure there are three big, somewhat different topics for our clustering algorithms to explore.
[Note]: Although our dataset only contains two columns, the instructor confirmed it is OK to use, because text data are usually harder to clean and will eventually result in more columns.
1. Kaggle Wine Review (400 rows): Containing reviews written by professional sommeliers about wines
2. Kaggle Coronavirus Tweet (400 rows): Containing real posts from Twitter by real human users discussing coronavirus
3. Kaggle Disaster Tweet (400 rows): Containing real posts from Twitter by real human users discussing disasters
Miglani, Aman. “Coronavirus Tweets NLP - Text Classification.” Kaggle, 8 Sept. 2020, www.kaggle.com/datatattle/covid-19-nlp-text-classification.
Zackthoutt. “Wine Reviews.” Kaggle, 27 Nov. 2017,Kaggle, www.kaggle.com/zynicide/wine-reviews.
“Natural Language Processing with Disaster Tweets.” Kaggle, www.kaggle.com/c/nlp-getting-started/data.
Blei, David M., et al. “Latent Dirichlet Allocation.” Journal of Machine Learning Research, 1 Jan. 2003, https://jmlr.org/papers/volume3/blei03a/blei03a.pdf.
import re  # used by the cleanup regexes below; was missing from the imports

# Load the merged three-source dataset, skipping malformed CSV rows.
# NOTE(review): error_bad_lines was removed in pandas 2.0 (use
# on_bad_lines='skip' there); kept for compatibility with the original env.
my_data = pd.read_csv('Text_Data_3_Source.csv', error_bad_lines=False)
for i in range(0, 5):
    print(my_data.iloc[i, 0])
print(my_data.shape)
data = my_data['text_content'].values.tolist()

# Raw strings so the regex escapes are not read as Python string escapes.
# Remove e-mail addresses
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
# Collapse runs of whitespace (including newlines) to a single space
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# Remove distracting single quotes
data = [re.sub(r"\'", '', sent) for sent in data]
def sent_to_words(sentences):
    """Yield each document as a list of lowercase tokens.

    deacc=True makes gensim's simple_preprocess strip punctuation
    and accents as it tokenizes.
    """
    for doc in sentences:
        yield gensim.utils.simple_preprocess(str(doc), deacc=True)

data_words = list(sent_to_words(data))
print(data_words[:1])
e.g.: "university of illinois" appears 500 times in our data, so 'university of illinois' should be considered as one token.
# Build the bigram and trigram models.
# min_count=5: a word pair must co-occur at least 5 times to form a phrase.
# threshold: higher threshold -> fewer phrases are accepted.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10) # higher threshold fewer phrases.
# The trigram model is trained on the bigram-merged corpus, so it can join
# an existing bigram with a third word.
trigram = gensim.models.Phrases(bigram[data_words], threshold=10)
# Phraser is a slimmed-down, faster wrapper for applying a trained Phrases model
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# Sanity check: show the second document with bigrams/trigrams merged
print(trigram_mod[bigram_mod[data_words[1]]])
# Start from NLTK's English stop-word list...
stop_words = st.words('english')
# ...then extend it with a couple of common extras plus gensim's stop-word set
stop_words.extend(['from', 're'] + list(swords))
# first 5 examples
stop_words[0:5]
e.g.: boys -> boy, playing -> play, played -> play
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop every stop word."""
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    """Merge frequent word pairs into single bigram tokens."""
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    """Merge frequent pairs/triples into bigram/trigram tokens."""
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def get_wordnet_pos(word):
    """Map a word's NLTK POS tag to the WordNet POS constant.

    Falls back to NOUN for any tag outside {J, N, V, R}.
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize(texts):
    """POS-aware lemmatization of every token, dropping stop words.

    The lemmatizer is instantiated once here instead of once per word
    (as the original did) -- identical output, far less object churn.
    """
    lemmatizer = WordNetLemmatizer()
    return [[lemmatizer.lemmatize(word, pos=get_wordnet_pos(word))
             for word in simple_preprocess(str(doc)) if word not in stop_words]
            for doc in texts]
## Apply those functions to our data
# Remove stop words
data_words_nostops = remove_stopwords(data_words)
# Form trigrams (this also applies the bigram model internally)
data_words_trigrams = make_trigrams(data_words_nostops)
# Lemmatize the phrase-merged tokens
data_lemmatized = lemmatize(data_words_trigrams)
print(data_lemmatized[:1])
## quick example of lemmatization: the POS comes from 'forgave' (verb),
## so 'forgive' is lemmatized as a verb
WordNetLemmatizer().lemmatize('forgive', pos=get_wordnet_pos('forgave'))
Because text data is a special data type, in this part we only show the top 50 words for the merged data, the top 10 words for each topic, and the number of words per row after text preprocessing.
# Slice the preprocessed corpus back into its three sources.
# Row layout of the merged CSV: 0-399 disaster, 400-799 covid, 800-1199 wine.
disater = data_lemmatized[0:400]
covid = data_lemmatized[400:800]
wine = data_lemmatized[800:]
# (older two-source slicing, kept for reference)
#covid = data_lemmatized[0:400]
#wine = data_lemmatized[400:]
# Ground-truth source label per row: 0 = disaster (rows 0-399),
# 1 = covid (rows 400-799), 2 = wine (rows 800-1199).
labels = [0] * 400 + [1] * 400 + [2] * 400
# Tokens kept per document after preprocessing, as a numpy array.
row_count = np.array([len(doc) for doc in data_lemmatized])
# Histogram of per-document token counts, with a dashed line at the mean.
plt.hist(row_count, bins = 20, color = 'c', edgecolor='k')
plt.axvline(row_count.mean(), color='k', linestyle='dashed', linewidth=1)
min_ylim, max_ylim = plt.ylim()
# Annotate the mean slightly right of the line, near the top of the axes
plt.text(row_count.mean()*1.1, max_ylim*0.9, 'Mean: {:.2f}'.format(row_count.mean()))
plt.title('Distribution of Word Count in Each Row ')
plt.xlabel('#words per row')
plt.ylabel('#documents')
plt.show()
def top_words(data, title, i):
    """Draw a plotly bar chart of the i most frequent tokens in *data*.

    data  : list of token lists (one per document)
    title : chart title
    i     : number of bars to draw

    NOTE(review): the [1:i+2] slice skips the single most frequent token --
    presumably deliberate (drops a dominant artifact); confirm and keep.
    """
    x = pd.Series(data)
    # Explode token lists into one long Series of tokens, then count them
    df = x.apply(pd.Series).stack().reset_index(drop=True)
    all_words = df.value_counts()
    freq_word = [go.Bar(
        x=all_words.index.values[1:i + 2],
        y=all_words.values[1:i + 2],
        # Color bars by the same counts that are plotted. The original
        # sliced [1:100] here, mismatching the x/y slices.
        marker=dict(colorscale='Jet',
                    color=all_words.values[1:i + 2]),
        text='Word counts'
    )]
    layout = go.Layout(title=title)
    fig = go.Figure(data=freq_word, layout=layout)
    fig.show()
# Top words overall, then per source (10 bars each)
top_words(data = data_lemmatized, title='Top Words Frequencies', i=50)
top_words(data =disater, title='Disaster: Top 10 Words Frequencies ', i=10)
top_words(data =covid, title='Covid: Top 10 Words Frequencies', i=10)
top_words(data =wine, title='Wine: Top 10 Words Frequencies', i=10)
# Bar chart of the token count of every individual document.
n_words = [len(doc) for doc in data_lemmatized]
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(range(1200), n_words)
plt.xlabel('#row_id')
plt.ylabel('#words')
plt.show()
In this part, we use TF-IDF(X1).
# Re-join each token list into a space-separated string, then TF-IDF it.
documents = [" ".join(tokens) for tokens in data_lemmatized]
vectorizer = TfidfVectorizer(stop_words='english')
X1 = vectorizer.fit_transform(documents)  # sparse document-term matrix
We use PCA to reduce the sparse matrix to 50 dimensions and then apply the Hopkins statistic to test whether the data is clusterable. The Hopkins statistics are very near 0, so our data is clusterable.
# Densify and reduce the TF-IDF matrix to 50 PCA components, then estimate
# clusterability with the Hopkins statistic over several random trials.
X1_pca_50 = PCA(n_components=50, random_state=430).fit_transform(X1.todense())
num_trials=5
hopkins_stats=[]
for i in range(0,num_trials):
    n = len(X1_pca_50)      # number of documents
    p = int(0.1 * n)        # sample 10% of the points per trial
    hopkins_stats.append(hopkins(X1_pca_50, p))
print(hopkins_stats)
# TSNE on the 50-D PCA projection (PCA first keeps TSNE tractable),
# colored by the known source labels.
tsne_tf = TSNE(random_state=430).fit_transform(PCA(n_components=50, random_state=430).fit_transform(X1.todense()))
sns.scatterplot(x=tsne_tf[:,0],y=tsne_tf[:,1], hue=labels)
plt.xlabel('X')
plt.ylabel('Y')
plt.title('TSNE Plot')
plt.show()
Hard to describe, because this is text data, not numeric data.
Yes, from the plot, the clusters are indeed balanced in size. Furthermore, we intentionally make it balance - 400 rows from each data source to ensure balanced cluster.
From the plot, there is overlap in two of the clusters - probably coronavirus tweet and disaster tweet. Coronavirus tweets may have something in common with disaster tweets, given coronavirus is also a disaster.
The initial motivation of this analysis is to evaluate how well unsupervised clustering can recover the known three-source structure of the text data.
# Create the gensim dictionary (token <-> integer id mapping)
id2word = corpora.Dictionary(data_lemmatized)
# Corpus: the lemmatized documents themselves
texts = data_lemmatized
# Term-document frequency (bag of words): each doc becomes (token_id, count) pairs
corpus = [id2word.doc2bow(text) for text in texts]
print('LDA Model-read version example:',corpus[:1])
# Same document, but with ids mapped back to the readable tokens
print('human-read version example:',[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]])
# How many clusters (topics) should we expect? Use the coherence score as
# an LDA analogue of the elbow method.
def compute_coherence_values(dictionary, corpus, texts, limit, start=1, step=1):
    """Train one LDA model per candidate topic count and score each.

    dictionary : gensim Dictionary used for the models and coherence
    corpus     : bag-of-words corpus
    texts      : tokenized documents (needed for 'c_v' coherence)
    limit/start/step : candidate num_topics come from range(start, limit, step)

    Returns (model_list, coherence_values, perplexity), one entry per
    candidate topic count.
    """
    coherence_values = []
    model_list = []
    perplexity = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                # was the global id2word; the function
                                                # should honor its own parameter
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=430,
                                                update_every=1,
                                                passes=20,
                                                alpha='auto',
                                                per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        perplexity.append(model.log_perplexity(corpus))
    return model_list, coherence_values, perplexity

model_list, coherence_values, perplexity = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=1, limit=6, step=1)
###### Coherence-score graph (must mirror the range used above)
limit=6; start=1; step=1;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# ("coherence_values") is just a parenthesized string; legend() would treat
# it as a sequence of one-character labels. Pass a list instead.
plt.legend(["coherence_values"], loc='best')
plt.show()
###### Perplexity graph
limit=6; start=1; step=1;
x = range(start, limit, step)
plt.plot(x, perplexity)
plt.xlabel("Num Topics")
plt.ylabel("Perplexity")
plt.legend(["Perplexity"], loc='best')
plt.show()
Coherence score is high at #topic = 2 and 3, but perplexity at #topic = 2 is high, thus we will pick #topic = 3
# Final LDA model: 3 topics (chosen from the coherence/perplexity curves),
# trained with more passes (80) than the model-selection runs above.
LDA_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=3,
                                            random_state=123,
                                            update_every=1,
                                            passes=80,
                                            alpha='auto',
                                            per_word_topics=True)
LDA_model.print_topics()
# One word cloud per topic, each in a distinct solid color.
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] # more colors: 'mcolors.XKCD_COLORS'
# color_func closes over the loop variable `i` below (late binding). This
# works only because generate_from_frequencies() is called inside the loop,
# after `i` has been updated -- a fragile but common WordCloud idiom.
cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)
# show_topics(formatted=False) returns (topic_id, [(word, weight), ...]) pairs
topics = LDA_model.show_topics(formatted=False)
fig, axes = plt.subplots(1, 3, figsize=(10,10), sharex=True, sharey=True)
for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')
plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()
# Interactive inter-topic distance / term-relevance visualization.
# NOTE(review): pyLDAvis.gensim was renamed to pyLDAvis.gensim_models in
# pyLDAvis 3.x -- kept as-is for the original environment.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(LDA_model, corpus, dictionary=LDA_model.id2word)
vis
def format_topics_sentences(ldamodel=None, corpus=corpus):
    """Return a DataFrame with the dominant topic of every document.

    ldamodel : trained gensim LdaModel
    corpus   : bag-of-words corpus to classify

    Returns one row per document with columns
    ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'].
    """
    records = []
    for i, row_list in enumerate(ldamodel[corpus]):
        # With per_word_topics=True each item is (doc_topics, ...); keep doc_topics
        row = row_list[0] if ldamodel.per_word_topics else row_list
        # Sort topics by probability, highest first
        row = sorted(row, key=lambda x: x[1], reverse=True)
        if row:
            # Dominant topic = highest-probability entry; collect its top keywords
            topic_num, prop_topic = row[0]
            wp = ldamodel.show_topic(topic_num)
            topic_keywords = ", ".join(word for word, prop in wp)
            records.append((int(topic_num), round(prop_topic, 4), topic_keywords))
    # Build the frame in one shot: per-row DataFrame.append was quadratic and
    # was removed in pandas 2.0; this also survives an empty corpus.
    return pd.DataFrame(records,
                        columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])
df_topic_sents_keywords = format_topics_sentences(ldamodel=LDA_model, corpus=corpus)
# Format: promote the row index to a Document_No column and rename
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords',]
df_dominant_topic.head(5)
# Elbow plot: average k-means inertia over 3 restarts for k = 1..5 on TF-IDF.
cluster_num_list = range(1, 6)
avg_inertia_list = []
for k in cluster_num_list:
    trials = [KMeans(n_clusters=k, init='k-means++').fit(X1).inertia_
              for _ in range(3)]
    avg_inertia_list.append(np.average(trials))
# Plot it
plt.plot(cluster_num_list, avg_inertia_list)
plt.xlabel('Number of Clusters Requested in K-means')
plt.ylabel('Average Inertia of the K-Means Results (3 trials)')
plt.title('Elbow Method Results')
plt.show()
# Final TF-IDF k-means with k=3 (from the elbow plot above)
k = 3
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state = 1000).fit(X1)
pred_tfidf = kmeans.labels_
# TSNE of the same PCA-reduced TF-IDF space, colored by cluster assignment
tsne_tf = TSNE(random_state=430).fit_transform(PCA(n_components=50, random_state=430).fit_transform(X1.todense()))
ax = sns.scatterplot(x=tsne_tf[:,0],y=tsne_tf[:,1],sizes=(30, 400), hue=pred_tfidf)
ax.set_title('TSNE Plot Colored by Predictions')
ax.set_xlabel(r'$x$')
ax.set_ylabel(r'$y$')
plt.show()
# Inspect the 10 highest-weight TF-IDF terms of each cluster centroid.
# argsort + reverse gives term indices ordered by descending centroid weight.
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2
# (get_feature_names_out); kept for the original environment.
terms = vectorizer.get_feature_names()
print("Top terms per cluster:")
for i in range(k):
    # The original's trailing commas were Python-2 leftovers that built and
    # discarded 1-tuples; plain print statements produce the same output.
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print("\n")
# Bag-of-words features for the same documents (raw counts, accents stripped)
vectorizer2 = CountVectorizer(strip_accents='unicode', stop_words='english')
X2 = vectorizer2.fit_transform(documents)
# 50-D dense PCA projection of the count matrix (used for TSNE below)
X2_new = PCA(n_components=50, random_state=430).fit_transform(X2.todense())
# Elbow plot for the bag-of-words features, mirroring the TF-IDF elbow above.
# (The original computed the inertias but never plotted them.)
cluster_num_list=range(1,6)
avg_inertia_list=[]
for k in cluster_num_list:
    sub_inertia_list=[]
    for i in range(0,3):
        kmeans=KMeans(n_clusters=k, init='k-means++',).fit(X2)
        sub_inertia_list.append(kmeans.inertia_)
    avg_inertia_list.append(np.average(sub_inertia_list))
plt.plot(cluster_num_list, avg_inertia_list)
plt.xlabel('Number of Clusters Requested in K-means')
plt.ylabel('Average Inertia of the K-Means Results (3 trials)')
plt.title('Elbow Method Results (Bag of Words)')
plt.show()
# Final bag-of-words k-means with k=3
k = 3
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state = 430)
kmeans.fit(X2)
pred_bw = kmeans.labels_
# TSNE of the PCA-reduced bag-of-words space: once colored by the k-means
# predictions, once by the ground-truth source labels.
tsne_bw = TSNE(random_state=430).fit_transform(PCA(n_components=50, random_state=430).fit_transform(X2.todense()))
ax = sns.scatterplot(x=tsne_bw[:,0],y=tsne_bw[:,1],sizes=(30, 400), hue=pred_bw)
ax.set_title('TSNE Plot Colored by Predictions')
ax.set_xlabel(r'$x$')
ax.set_ylabel(r'$y$')
plt.show()
ax = sns.scatterplot(x=tsne_bw[:,0],y=tsne_bw[:,1],sizes=(30, 400), hue=labels)
ax.set_title('TSNE Plot Colored by Real labels')
ax.set_xlabel(r'$x$')
ax.set_ylabel(r'$y$')
plt.show()
# Inspect the 10 highest-count terms of each bag-of-words cluster centroid.
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2
# (get_feature_names_out); kept for the original environment.
terms = vectorizer2.get_feature_names()
print("Top terms per cluster:")
for i in range(k):
    # Trailing commas removed: they were Python-2 leftovers that built and
    # discarded 1-tuples; output is unchanged.
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print("\n")
# Silhouette score (higher is better). Each clustering must be scored on the
# feature matrix it was fit on -- the original scored the bag-of-words
# predictions against X1, the TF-IDF matrix.
print('kmeans-TF_IDF',silhouette_score(X1, pred_tfidf))
print('kmeans-Bag of Words',silhouette_score(X2, pred_bw))
# Adjusted Rand Index (symmetric in its two arguments):
#   ~0: random labeling independently of the number of clusters and samples
#    1: clusterings are identical
print('Bag-of-words LDA by dominant topic',adjusted_rand_score(df_dominant_topic['Dominant_Topic'], labels))
print('kmeans-TF_IDF',adjusted_rand_score(pred_tfidf, labels))
print('kmeans-Bag of Words',adjusted_rand_score(pred_bw, labels))
# Homogeneity: each cluster contains only members of a single class.
# sklearn's signature is (labels_true, labels_pred) -- ground truth first.
# The original passed predictions first (which actually computes the mirrored
# metric) and referenced an undefined name `label` on the first line.
print('Bag-of-words LDA by dominant topic',homogeneity_score(labels, df_dominant_topic['Dominant_Topic']))
print('kmeans-TF_IDF',homogeneity_score(labels, pred_tfidf))
print('kmeans-Bag of Words',homogeneity_score(labels, pred_bw))
# Completeness: all members of a given class are assigned to the same cluster.
print('Bag-of-words LDA by dominant topic',completeness_score(labels, df_dominant_topic['Dominant_Topic']))
print('kmeans-TF_IDF',completeness_score(labels, pred_tfidf))
print('kmeans-Bag of Words',completeness_score(labels, pred_bw))
# LDA comparison plot: TF-IDF TSNE coordinates, hue = LDA dominant topic,
# marker style = ground-truth source label.
fig, ax = plt.subplots(figsize=(18, 11))
with sns.plotting_context("notebook", font_scale=1.5):
    sns.scatterplot(x=tsne_tf[:,0],y=tsne_tf[:,1],sizes=(30, 400), hue=np.array(df_dominant_topic['Dominant_Topic'].astype(int)), style=labels)
ax.set_xlabel(r'$x$')
ax.set_ylabel(r'$y$')
plt.show()
# kmeans-TF_IDF comparison plot: hue = TF-IDF k-means prediction,
# marker style = ground-truth source label.
fig, ax = plt.subplots(figsize=(18, 11))
with sns.plotting_context("notebook", font_scale=1.5):
    sns.scatterplot(x=tsne_tf[:,0],y=tsne_tf[:,1],sizes=(30, 400), hue=pred_tfidf, style=labels)
ax.set_xlabel(r'$x$')
ax.set_ylabel(r'$y$')
plt.show()
# kmeans-Bag of Words comparison plot: hue = bag-of-words k-means prediction,
# marker style = ground-truth source label.
fig, ax = plt.subplots(figsize=(18, 11))
with sns.plotting_context("notebook", font_scale=1.5):
    # Both coordinates must come from the bag-of-words embedding; the
    # original mixed tsne_tf x with tsne_bw y, scrambling the layout.
    sns.scatterplot(x=tsne_bw[:,0], y=tsne_bw[:,1], sizes=(30, 400), hue=pred_bw, style=labels)
ax.set_xlabel(r'$x$')
ax.set_ylabel(r'$y$')
plt.show()
# Embed a local screenshot ('1.png') inline as a base64 data URI so the
# notebook remains self-contained when exported.
import base64, io, IPython
from PIL import Image as PILImage
image = PILImage.open('1.png')
output = io.BytesIO()
# Re-encode to PNG in memory, then base64-encode the bytes
image.save(output, format='PNG')
encoded_string = base64.b64encode(output.getvalue()).decode()
html = '<img src="data:image/png;base64,{}"/>'.format(encoded_string)
IPython.display.HTML(html)
LDA is a soft clustering, in order to make the comparison at hard assignment level, here we take a dominant topic (the cluster with highest probability) in order to form the comparison.
In summary, LDA is good at recognizing the topics discussed in the documents and provides good performance even with a simple text conversion technique like bag-of-words. K-means gives good performance when using TF-IDF; when using bag-of-words, its result is less interpretable than LDA's.